/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the LICENSE file.
------------------------------------------------------------------------------*/
#ifndef _lucene_analysis_Analyzers_
#define _lucene_analysis_Analyzers_

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "../util/Reader.hpp"
#include "AnalysisHeader.hpp"
//#include "../util/Misc.hpp"

CL_NS_DEF(analysis)

/** An abstract base class for simple, character-oriented tokenizers.*/
class CharTokenizer: public Tokenizer
{
private:
	int32_t offset, bufferIndex, dataLen;
	UChar buffer[LUCENE_MAX_WORD_LEN+1];
	const UChar *ioBuffer;
protected:

	/**
	* Returns true if a character should be included in a token. This
	* tokenizer generates as tokens adjacent sequences of characters which
	* satisfy this predicate. Characters for which this is false are used to
	* define token boundaries and are not included in tokens.
	*/
	virtual bool isTokenChar(const UChar c) const = 0;

	/**
	* Called on each token character to normalize it before it is added to the
	* token. The default implementation does nothing. Subclasses may use this
	* to, e.g., lowercase tokens.
	*/
	virtual UChar normalize(const UChar c) const;

public:
	CharTokenizer(CL_NS(util)::Reader* in);
	virtual ~CharTokenizer() {
	}
	bool next(Token* token);
};
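
/* Example: a minimal CharTokenizer subclass (a hedged sketch; DigitTokenizer is a
 * hypothetical name, not part of this library). It keeps only the ASCII digits,
 * so the input "abc123 456" would yield the tokens "123" and "456":
 *
 *   class DigitTokenizer: public CharTokenizer {
 *   public:
 *       DigitTokenizer(CL_NS(util)::Reader* in): CharTokenizer(in) {}
 *   protected:
 *       bool isTokenChar(const UChar c) const { return c >= '0' && c <= '9'; }
 *   };
 */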


/**
A LetterTokenizer is a tokenizer that divides text at non-letters. That is
to say, it defines tokens as maximal strings of adjacent letters, as determined
by the _istalpha predicate.

Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces.
*/
class LetterTokenizer: public CharTokenizer
{
public:
	/** Construct a new LetterTokenizer. */
	LetterTokenizer(CL_NS(util)::Reader* in):
			CharTokenizer(in) {}

	~LetterTokenizer() {}
protected:
	/** Collects only characters which satisfy _istalpha.*/
	bool isTokenChar(const UChar c) const;
};



/**
* LowerCaseTokenizer performs the function of LetterTokenizer
* and LowerCaseFilter together. It divides text at non-letters and converts
* the tokens to lower case. While it is functionally equivalent to the combination
* of LetterTokenizer and LowerCaseFilter, there is a performance advantage
* to doing the two tasks at once, hence this (redundant) implementation.
* <P>
* Note: this does a decent job for most European languages, but does a terrible
* job for some Asian languages, where words are not separated by spaces.
*/
class LowerCaseTokenizer: public LetterTokenizer
{
public:
	/** Construct a new LowerCaseTokenizer. */
	LowerCaseTokenizer(CL_NS(util)::Reader* in):
			LetterTokenizer(in) {}

	~LowerCaseTokenizer() {}
protected:
	/** Converts the character to lower case using _totlower. */
	UChar normalize(const UChar chr) const;
};


/**
 * A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
 * Adjacent sequences of non-whitespace characters form tokens.
 */
class WhitespaceTokenizer: public CharTokenizer
{
public:
	/** Construct a new WhitespaceTokenizer. */
	WhitespaceTokenizer(CL_NS(util)::Reader* in): CharTokenizer(in) {}
	~WhitespaceTokenizer() {}
protected:
	/** Collects only characters which do not satisfy _istspace. */
	bool isTokenChar(const UChar c) const;
};
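
/* Example: driving a tokenizer by hand (a hedged sketch; it assumes Token is
 * default-constructible and that a suitable CL_NS(util)::Reader over the input
 * text is available -- neither is declared in this header):
 *
 *   CL_NS(util)::Reader* reader = ...;   // any Reader over the text "foo bar"
 *   WhitespaceTokenizer tokenizer(reader);
 *   Token token;
 *   while (tokenizer.next(&token)) {
 *       // the first call fills the token with "foo", the second with "bar",
 *       // then next() returns false
 *   }
 */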


/** An Analyzer that uses LetterTokenizer. */
class LetterAnalyzer: public Analyzer
{
public:
	TokenStream* tokenStream(const UChar *fieldName, CL_NS(util)::Reader* reader);
	~LetterAnalyzer() {}
};

/** An Analyzer that uses WhitespaceTokenizer. */
class WhitespaceAnalyzer: public Analyzer
{
public:
	TokenStream* tokenStream(const UChar *fieldName, CL_NS(util)::Reader* reader);
	~WhitespaceAnalyzer() {}
};

/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
class SimpleAnalyzer: public Analyzer
{
public:
	TokenStream* tokenStream(const UChar *fieldName, CL_NS(util)::Reader* reader);
	~SimpleAnalyzer() {}
};
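
/* Example: obtaining a TokenStream from an analyzer (a hedged sketch; the field
 * name is arbitrary and ownership of the returned stream is an assumption):
 *
 *   SimpleAnalyzer analyzer;
 *   CL_NS(util)::Reader* reader = ...;   // any Reader over the text "Foo BAR"
 *   const UChar* field = ...;            // field name, e.g. "contents"
 *   TokenStream* stream = analyzer.tokenStream(field, reader);
 *   Token token;
 *   while (stream->next(&token)) {
 *       // SimpleAnalyzer lower-cases, so this yields "foo" and then "bar"
 *   }
 *   delete stream;                       // assumption: the caller owns the stream
 */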


/**
* Normalizes token text to lower case.
*/
class LowerCaseFilter: public TokenFilter
{
public:
	LowerCaseFilter(TokenStream* in, bool deleteTokenStream): TokenFilter(in, deleteTokenStream) {}
	~LowerCaseFilter() {}
	bool next(Token* token);
};
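
/* Example: composing a filter over a tokenizer (a hedged sketch). The second
 * constructor argument, deleteTokenStream, is taken here to mean that the filter
 * deletes the wrapped stream on destruction (an assumption about the TokenFilter
 * base declared in AnalysisHeader.hpp); stack allocation with 'false' sidesteps
 * the ownership question:
 *
 *   CL_NS(util)::Reader* reader = ...;   // any Reader over the input text
 *   WhitespaceTokenizer tokenizer(reader);
 *   LowerCaseFilter lowercased(&tokenizer, false);
 *   Token token;
 *   while (lowercased.next(&token)) {
 *       // tokens are split on whitespace and emitted in lower case
 *   }
 */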


/*
 * Removes stop words from a token stream.
 */
/*
class StopFilter: public TokenFilter
{
private:
	//bvk: I found this to work faster with a non-hash table; the number of items
	//in the stop table is not likely to make hashing worthwhile.
	CL_NS(util)::CLSetList<const UChar*>* table;
public:
	// Constructs a filter which removes words from the input
	//	TokenStream that are named in the array of words.
	StopFilter(TokenStream* in, bool deleteTokenStream, const UChar** stopWords);

	~StopFilter() {}

	// Constructs a filter which removes words from the input
	//	TokenStream that are named in the CLSetList.
	StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList<const UChar*>* stopTable):
			TokenFilter(in, deleteTokenStream),
			table(stopTable) {}

	// Builds a stop-word table from an array of stop words, appropriate for passing
	// into the StopFilter constructor. This permits the table construction to
	// be cached once when an Analyzer is constructed.
	// Note: the stopWords list must be a static list because the strings are not copied.
	static void fillStopTable(CL_NS(util)::CLSetList<const UChar*>* stopTable,
	                          const UChar** stopWords);

	// Returns the next input Token whose termText() is not a stop word.
	bool next(Token* token);
};
*/


/* Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
/*
class StopAnalyzer: public Analyzer
{
	CL_NS(util)::CLSetList<const UChar*> stopTable;

public:
	// Builds an analyzer which removes words in ENGLISH_STOP_WORDS.
	StopAnalyzer();
	~StopAnalyzer();

	// Builds an analyzer which removes words in the provided array.
	StopAnalyzer(const UChar** stopWords);
	// Filters LowerCaseTokenizer with StopFilter.
	TokenStream* tokenStream(const UChar *fieldName, CL_NS(util)::Reader* reader);

	// An array containing some common English words that are not usually useful for searching.
	static const UChar *ENGLISH_STOP_WORDS[];
};
*/


/*
 * This analyzer is used to facilitate scenarios where different
 * fields require different analysis techniques. Use {@link #addAnalyzer}
 * to add a non-default analyzer on a field name basis.
 *
 * <p>Example usage:
 *
 * <pre>
 *   PerFieldAnalyzerWrapper aWrapper =
 *      new PerFieldAnalyzerWrapper(new StandardAnalyzer());
 *   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
 *   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
 * </pre>
 *
 * <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
 * and "lastname", for which KeywordAnalyzer will be used.
 *
 * <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
 * and query parsing.
 */
/*
class PerFieldAnalyzerWrapper : public Analyzer
{
private:
	Analyzer* defaultAnalyzer;
	CL_NS(util)::CLHashMap < const UChar*, Analyzer*, CL_NS(util)::Compare::UChar,
	CL_NS(util)::Equals::UChar, CL_NS(util)::Deletor::tcArray, CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap;
public:
	// Constructs with default analyzer.
	// @param defaultAnalyzer Any fields not specifically
	// defined to use a different analyzer will use the one provided here.
	PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);
	~PerFieldAnalyzerWrapper();

	// Defines an analyzer to use for the specified field.
	// @param fieldName field name requiring a non-default analyzer
	// @param analyzer non-default analyzer to use for field
	void addAnalyzer(const UChar *fieldName, Analyzer* analyzer);
	TokenStream* tokenStream(const UChar *fieldName, CL_NS(util)::Reader* reader);
};
*/

/**
 * A filter that replaces accented characters in the ISO Latin 1 character set
 * (ISO-8859-1) with their unaccented equivalents. The case is not altered.
 * <p>
 * For instance, '&agrave;' will be replaced by 'a'.
 */
class ISOLatin1AccentFilter: public TokenFilter
{
public:
	ISOLatin1AccentFilter(TokenStream* input, bool deleteTs):
			TokenFilter(input, deleteTs) {
	}

	/**
	 * Replaces accented characters in the token text with their unaccented equivalents.
	 */
	bool next(Token* token);
};
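
/* Example: stripping accents from tokens (a hedged sketch):
 *
 *   CL_NS(util)::Reader* reader = ...;   // any Reader over the text "déjà vu"
 *   WhitespaceTokenizer tokenizer(reader);
 *   ISOLatin1AccentFilter unaccented(&tokenizer, false);
 *   Token token;
 *   while (unaccented.next(&token)) {
 *       // yields "deja" and then "vu"; the original case is preserved
 *   }
 */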


/**
 * Emits the entire input as a single token.
 */
class KeywordTokenizer: public Tokenizer
{
private:
	LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256);
	bool done;
	int bufferSize;
public:
	KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize = -1);
	virtual ~KeywordTokenizer();
	bool next(Token* token);
};

/**
 * "Tokenizes" the entire stream as a single token. This is useful
 * for data like zip codes, ids, and some product names.
 */
class KeywordAnalyzer: public Analyzer
{
public:
	TokenStream* tokenStream(const UChar *fieldName, CL_NS(util)::Reader* reader);
	virtual ~KeywordAnalyzer() {}
};
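
/* Example: analyzing an identifier-like value verbatim (a hedged sketch; the field
 * name is arbitrary and stream ownership is an assumption):
 *
 *   KeywordAnalyzer analyzer;
 *   CL_NS(util)::Reader* reader = ...;   // any Reader over the text "SKU-1234-XL"
 *   const UChar* field = ...;            // field name, e.g. "sku"
 *   TokenStream* stream = analyzer.tokenStream(field, reader);
 *   Token token;
 *   if (stream->next(&token)) {
 *       // exactly one token, "SKU-1234-XL"; a further next() returns false
 *   }
 *   delete stream;                       // assumption: the caller owns the stream
 */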


/**
 * Removes words that are too long or too short from the stream.
 */
class LengthFilter: public TokenFilter
{
private:
	int _min;
	int _max;
public:
	/**
	* Build a filter that removes words that are too long or too
	* short from the text.
	*/
	LengthFilter(TokenStream* in, int _min, int _max);

	/**
	* Returns the next input Token whose termText() has an acceptable length.
	*/
	bool next(Token* token);
};
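
/* Example: keeping only tokens between 3 and 8 characters long (a hedged sketch;
 * the bounds are assumed to be inclusive):
 *
 *   CL_NS(util)::Reader* reader = ...;   // any Reader over "a real tokenization example"
 *   WhitespaceTokenizer tokenizer(reader);
 *   LengthFilter filtered(&tokenizer, 3, 8);
 *   Token token;
 *   while (filtered.next(&token)) {
 *       // yields "real" and "example"; "a" and "tokenization" are filtered out
 *   }
 */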


CL_NS_END
#endif
